Data Preparation

Import Dataset

Data Preprocessing

Defining the Preprocessing Recipe

# Define and prep the preprocessing recipe on the training split.
#
# Pipeline stages:
#   1. drop every column except the outcome (`sentiment`) and the text (`tweet`)
#   2. encode the outcome as a factor with explicit levels, then downsample
#   3. normalise the raw text with textclean `replace_*()` and stringr helpers
#   4. tokenise, remove stop words, stem, cap the vocabulary at 256 tokens,
#      and compute TF-IDF features
rec <- recipe(sentiment ~ ., data = training(splitted)) %>%
  step_rm(-sentiment, -tweet) %>%
  step_string2factor(sentiment, levels = c("negative", "neutral", "positive")) %>%
  # NOTE(review): newer releases ship step_downsample() in {themis} and rename
  # `ratio` to `under_ratio` -- confirm against the installed package versions.
  step_downsample(sentiment, ratio = 1/1, seed = 100) %>%
  # --- text normalisation ---------------------------------------------------
  step_mutate(tweet = str_squish(tweet)) %>%
  step_mutate(tweet = replace_html(tweet, symbol = FALSE)) %>%
  step_mutate(tweet = replace_kern(tweet)) %>%
  step_mutate(tweet = replace_word_elongation(tweet)) %>%
  step_mutate(tweet = replace_date(tweet, replacement = "datewords")) %>%
  step_mutate(tweet = replace_time(tweet, replacement = "timewords")) %>%
  step_mutate(tweet = replace_money(tweet, replacement = "moneywords")) %>%
  # ordinals are handled before plain numbers so "1st" is not split up first
  step_mutate(tweet = replace_ordinal(tweet, remove = FALSE)) %>%
  step_mutate(tweet = replace_number(tweet, remove = FALSE)) %>%
  step_mutate(tweet = replace_internet_slang(tweet)) %>%
  step_mutate(tweet = replace_contraction(tweet)) %>%
  step_mutate(tweet = replace_emoji(tweet)) %>%
  step_mutate(tweet = replace_symbol(tweet)) %>%
  step_mutate(tweet = str_squish(tweet)) %>%
  # Strip leftover <...> tags one at a time. The previous greedy pattern
  # "(<.*>)" removed everything between the first "<" and the last ">",
  # including ordinary words sitting between two tags.
  step_mutate(tweet = str_replace_all(tweet, "<[^>]*>", "")) %>%
  step_mutate(tweet = str_replace_all(tweet, "[:digit:]", "")) %>%
  # --- tokenisation and feature extraction ----------------------------------
  step_tokenize(tweet, token = "words") %>%
  # Stop words must be removed BEFORE stemming: stemmed forms such as
  # "thi" (this) or "wa" (was) no longer match the stop-word list.
  step_stopwords(tweet) %>%
  step_stem(tweet) %>%
  step_tokenfilter(tweet, max_tokens = 256) %>%
  step_tfidf(tweet) %>%
  # The argument is `strings_as_factors`; the former `string_as_factor` was a
  # typo that was swallowed by `...`, leaving the default (TRUE) in effect.
  prep(strings_as_factors = FALSE)

# extract the processed training data from the prepped recipe, then apply the
# same recipe to the test split
data_train <- rec %>% juice()
data_test <- rec %>% bake(testing(splitted))

# preview the first 10 rows of the processed training data
rec %>%
  juice() %>%
  head(n = 10)

Model Fitting

Defining Model Specifications

#> Random Forest Model Specification (classification)
#> 
#> Main Arguments:
#>   mtry = 3
#>   trees = 500
#>   min_n = 1
#> 
#> Engine-Specific Arguments:
#>   seed = 100
#>   num.threads = parallel::detectCores()/2
#>   importance = impurity
#> 
#> Computational engine: ranger

Model Fitting

#> parsnip model object
#> 
#> Ranger result
#> 
#> Call:
#>  ranger::ranger(formula = formula, data = data, mtry = ~3, num.trees = ~500,      min.node.size = ~1, seed = ~100, num.threads = ~parallel::detectCores()/2,      importance = ~"impurity", verbose = FALSE, probability = TRUE) 
#> 
#> Type:                             Probability estimation 
#> Number of trees:                  500 
#> Sample size:                      5661 
#> Number of independent variables:  256 
#> Mtry:                             3 
#> Target node size:                 1 
#> Variable importance mode:         impurity 
#> Splitrule:                        gini 
#> OOB prediction error (Brier s.):  0.2962273

Model Evaluation

Predict on Test Dataset

Confusion Matrix

ROC Curve

Precision-Recall Curve